# R script for for replicating supplementary analysis in appendix D & E
# in 'When Parties Move to the Middle: The Role of Uncertainty'
# by J. Lindvall, D. Rueda, and H. Zhai
# this file written by: H Zhai (2022-11-03 [updated: -])
# on device: Mac Pro 13 Dual-Core Intel Core i5 2.3 GHz 

# PLEASE MAKE SURE ALL THE REPLICATION FILES (DATA AND SCRIPTS) ARE STORED AT THE SAME LEVEL IN THE SAME DIRECTORY
# OR MAKE SURE THE DIRECTORY-RELATED CODES ARE PROPERLY ADJUSTED 
# TO ENSURE THE CODES RUN WITHOUT DIRECTORY-RELATED PROBLEMS
# RESTART R SESSION BEFORE RUNNING

# This file replicates results from supplementary analysis reported in appendix sections D and E
# In the order they appear in text

# BEGIN SCRIPT
rm(list = ls())

# pkgs --------------------------------------------------------------------

if (!require("tidyverse")) install.packages("tidyverse")
if (!require("magrittr")) install.packages("magrittr")
if (!require("lmtest")) install.packages("lmtest")
if (!require("sandwich")) install.packages("sandwich")
if (!require("stargazer")) install.packages("stargazer")
if (!require("foreach")) install.packages("foreach")
if (!require("doParallel")) install.packages("doParallel")  
if (!require("broom")) install.packages("broom")
if (!require("interflex")) install.packages("interflex")
if (!require("margins")) install.packages("margins")
if (!require("grf")) install.packages("grf")
if (!require("Hmisc")) install.packages("Hmisc")

# load data ---------------------------------------------------------------

load("RData_data_core.RData") # core sample (n_c=20)
load("RData_data_final.RData") # full sample (n_c=34)

# load outputs (main analysis) --------------------------------------------

# make sure the .RData file is downloaded or `R_1_main_analysis.R` has been run

load("RData_result_main.RData") # model objs

# graphx ------------------------------------------------------------------

theme_base <- theme_get()
theme_set(theme_minimal(base_size = 14))

# Table 4 -----------------------------------------------------------------

data_core %>% 
  select(leftright_mid, leftright_dif, leftright_med_lag, leftright_den_rlog_lag,
         compl_leftright, compr_leftright, leftright_pl_lag, vturn_lag, effpar_ele_lag, 
         gov_party_y3, ud_ipol_y3, #sstran_y3, 
         openc_y3, realgdpgr_y3, unemp_y3) %>% # select key vars
  mutate_at(vars(starts_with("comp")), ~as.numeric(as.character(.))) %>% # coerce factors to numerics
  data.frame() %>% # force to data frame for stargazer
  stargazer(summary.stat = c("n", "mean", "sd", "min", "max"), style = "ajps", digits = 2, 
            title = "Summary statistics of main variables", label = "tab:sumstat")


# Tables 5-6 --------------------------------------------------------------

## define helpers ----
chcse <- function(model) { 
  coeftest(model, vcov. = vcovHC(
    model, type = "HC0", cluster = "countryname"))[,2] # hcse clustered by country (only hc0 works)
}
update_to_log <- function(model, measure = NULL, data = NULL) {
  if (!is.null(measure)) {
    if (measure == "mid") { # dv = joint position
      model1 <- update(model, formula. = leftright_log_mid~.-leftright_den_rlog_lag:leftright_med_lag-leftright_den_rlog_lag-leftright_med_lag+leftright_log_den_rlog_lag*leftright_log_med_lag-
                         compl_leftright+compl_leftright_log-compr_leftright+compr_leftright_log-leftright_pl_lag+leftright_log_pl_lag,
                       data = data)
    }
    else if (measure == "dif") { # dv = distance
      model1 <- update(model, formula. = leftright_log_dif~.-leftright_den_rlog_lag+leftright_log_den_rlog_lag-
                         compl_leftright+compl_leftright_log-compr_leftright+compr_leftright_log-leftright_pl_lag+leftright_log_pl_lag,
                       data = data)
    }
  }
  else {model1 <- model} # no change if no argument supplied
  return(model1)
}

## fit models ----
### Table 5: DV = average position
rob_lm_mid_fe_str_log <- update_to_log(lm_mid_fe_str, measure = "mid", data = data_clean) 
rob_lm_mid_fe_str_log_se <- chcse(rob_lm_mid_fe_str_log)
rob_lm_mid_fe_log <- update_to_log(lm_mid_fe, measure = "mid", data = data_clean) 
rob_lm_mid_fe_log_se <- chcse(rob_lm_mid_fe_log)
rob_lm_mid_ldv_str_log <- update_to_log(lm_mid_ldv_str, measure = "mid", data = data_clean)
rob_lm_mid_ldv_str_log_se <- chcse(rob_lm_mid_ldv_str_log)
rob_lm_mid_ldv_log <- update_to_log(lm_mid_ldv, measure = "mid", data = data_clean)
rob_lm_mid_ldv_log_se <- chcse(rob_lm_mid_ldv_log)
### Table 6: DV = distance
rob_lm_dif_fe_str_log <- update_to_log(lm_dif_fe_str, measure = "dif", data = data_clean)
rob_lm_dif_fe_str_log_se <- chcse(rob_lm_dif_fe_str_log)
rob_lm_dif_fe_log <- update_to_log(lm_dif_fe, measure = "dif", data = data_clean)
rob_lm_dif_fe_log_se <- chcse(rob_lm_dif_fe_log)
rob_lm_dif_ldv_str_log <- update_to_log(lm_dif_ldv_str, measure = "dif", data = data_clean)
rob_lm_dif_ldv_str_log_se <- chcse(rob_lm_dif_ldv_str_log)
rob_lm_dif_ldv_log <- update_to_log(lm_dif_ldv, measure = "dif", data = data_clean)
rob_lm_dif_ldv_log_se <- chcse(rob_lm_dif_ldv_log)

## Table 5 ----
stargazer(rob_lm_mid_fe_str_log, rob_lm_mid_fe_log, rob_lm_mid_ldv_str_log, rob_lm_mid_ldv_log, 
          se = list(rob_lm_mid_fe_str_log_se, rob_lm_mid_fe_log_se, rob_lm_mid_ldv_str_log_se, rob_lm_mid_ldv_log_se),
          column.labels = rep(c("FE", "LDV"), 4), column.sep.width = "0pt", 
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"), 
          df = F, digits = 2, align = T, style = "ajps", notes.label = "Notes: ", notes.align ="l", 
          star.cutoffs = c(0.05, 0.01, 0.001), notes = "Heteroscedasticity-consistent standard errors clustered by country in brackets.") 

## Table 6 ----
stargazer(rob_lm_dif_fe_str_log, rob_lm_dif_fe_log, rob_lm_dif_ldv_str_log, rob_lm_dif_ldv_log,
          se = list(rob_lm_dif_fe_str_log_se, rob_lm_dif_fe_log_se, rob_lm_dif_ldv_str_log_se, rob_lm_dif_ldv_log_se),
          column.labels = rep(c("FE", "LDV"), 4), column.sep.width = "0pt", 
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"), 
          df = F, digits = 2, align = T, style = "ajps", notes.label = "Notes: ", notes.align ="l", 
          star.cutoffs = c(0.05, 0.01, 0.001), notes = "Heteroscedasticity-consistent standard errors clustered by country in brackets.") 

# Figure 7 ----------------------------------------------------------------

# inherits results from Tables 5-6

## estimate CMEs ----
me_mid_fe_log <- margins(rob_lm_mid_fe_log, vcov=vcovHC(rob_lm_mid_fe_log, type="HC0", cluster = "countryname"),
                         data = droplevels(rob_lm_mid_fe_log$model), variables = "leftright_log_med_lag", 
                         at = list(leftright_log_den_rlog_lag = prediction::seq_range(rob_lm_mid_fe_log$model[["leftright_log_den_rlog_lag"]], 10))) # use FE models only

## plot CMEs ----
ggplot(me_mid_fe_log) +
  geom_histogram(data = rob_lm_mid_fe_log$model, aes(leftright_log_den_rlog_lag, ..count../sum(..count..)*10), 
                 bins = 50,  color = "grey", fill = "white", alpha = 0.7) + 
  geom_line(aes(x = leftright_log_den_rlog_lag, y = dydx_leftright_log_med_lag)) +
  geom_line(aes(x = leftright_log_den_rlog_lag, y = dydx_leftright_log_med_lag + 1.96*sqrt(Var_dydx_leftright_log_med_lag)), lty = "dashed") +
  geom_line(aes(x = leftright_log_den_rlog_lag, y = dydx_leftright_log_med_lag - 1.96*sqrt(Var_dydx_leftright_log_med_lag)), lty = "dashed") +  
  geom_point(aes(x = leftright_log_den_rlog_lag, y = dydx_leftright_log_med_lag), shape = 18, size = 2) +
  labs(title = "CME of median voter position on main party average position", subtitle = "Conditional on median voter certainty, FE estimates",
       x = "Lagged Median Voter Position Certainty", y = "CME of Lagged Median Voter Position") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) 

# Figure 8 ----------------------------------------------------------------

## preprocess data ----
data_interflex <- data_clean # set aside new dataset
data_interflex$compl_leftright <- as.factor(data_interflex$compl_leftright)
data_interflex$compr_leftright <- as.factor(data_interflex$compr_leftright)
z_lr <- c("compl_leftright", "compr_leftright", 
          "vturn_lag", "leftright_pl_lag", "effpar_ele_lag", 
          "gov_party_y3", "ud_ipol_y3", "openc_y3", "realgdpgr_y3", "unemp_y3")

## run model (turned off by default) ----
run_kernel_reg <- FALSE # takes some time. not run unless needed.
if (run_kernel_reg) {
  set.seed(123)
  kerreg_fe <- interflex(
    estimator = "kernel",
    data = as.data.frame(data_interflex), 
    Y = "leftright_mid", D = "leftright_med_lag", X = "leftright_den_rlog_lag", 
    treat.type = "continuous",
    Z = z_lr, FE = c("countryname", "eyear"),
    xlab = "Lagged Median Voter Certainty", ylab = "CME of Lagged Median Voter Position",
    main = "Kernel regression results, FE estimates", 
    theme.bw = T, na.rm = T)
}

## plot CMEs ----
if (run_kernel_reg) kerreg_fe$figure 

# Table 7 -----------------------------------------------------------------

## preprocess data ----
### variable IDs
y1 <- "leftright_mid"
y2 <- "leftright_dif"
d1 <- "leftright_med_lag"
d2 <- "leftright_den_rlog_lag"
xx <- c("leftright_den_rlog_lag",
        "compl_leftright", "compr_leftright", 
        "leftright_pl_lag", "vturn_lag", "effpar_ele_lag", 
        "gov_party_y3", "ud_ipol_y3", #"sstran_y3", 
        "openc_y3", "realgdpgr_y3", "unemp_y3")
cl <- "country"
### clean data
convert_fct <- function(f) {
  if (is.factor(f)) as.numeric(levels(f))[f]
  else f
}
df <- # data_clean %>% 
  data_core %>% 
  select(any_of(c(y1,y2,xx,d1,d2,cl))) %>% 
  mutate(across(.fns = convert_fct)) %>% 
  na.omit()

## split train-test sets ----
set.seed(123)
train.id <- sample(seq_len(nrow(df)), round(nrow(df) * .8))
train <- df[train.id,]
test <- df[-train.id, ]

## train grfs ----
n.trees <- 1e4
seed <- 123
r1 <- causal_forest(X = as.matrix(train[,xx]), 
                    Y = as.matrix(train[,y1]), W = as.matrix(train[,d1]), 
                    clusters = unlist(train[,cl]), 
                    num.trees = n.trees, seed = seed)
r2 <- causal_forest(X = as.matrix(train[,xx[-1]]), 
                    Y = as.matrix(train[,y2]), W = as.matrix(train[,d2]), 
                    clusters = unlist(train[,cl]),
                    num.trees = n.trees, seed = seed)
r3 <- causal_forest(X = as.matrix(train[,xx]), 
                    Y = as.matrix(train[,y1]), W = as.matrix(train[,d2]), 
                    clusters = unlist(train[,cl]), 
                    num.trees = n.trees, seed = seed)

## get results (table figures) ----
average_treatment_effect(r1) 
# estimate   std.err 
# 0.2897248 0.1311686
average_treatment_effect(r2)
# estimate     std.err 
# -0.21479808  0.04176852
average_treatment_effect(r3)
# estimate     std.err 
# -0.01727134  0.24509099 

# Figures 9-10 ------------------------------------------------------------

# inherits outputs from Table 7

## predict CMEs ----
preds1 <- predict(r1, newdata = as.matrix(test[,xx]), estimate.variance = TRUE)
preds2 <- predict(r2, newdata = as.matrix(test[,xx[-1]]), estimate.variance = TRUE)
preds1x <- cbind.data.frame(test, cate_hat = preds1[,1], cate_se_hat = preds1[,2])
preds2x <- cbind.data.frame(test, cate_hat = preds2[,1], cate_se_hat = preds2[,2])

## Figure 9 ----
ggplot(preds1x, aes(leftright_den_rlog_lag, cate_hat)) +
  geom_pointrange(aes(ymin = cate_hat-1.96*cate_se_hat, ymax = cate_hat+1.96*cate_se_hat),
                  alpha = 0.7, color = "grey20", size = 0.3, shape = 18, fatten = 10) +
  geom_smooth(method = "lm", color = "gray40", fill = "gray60", size = 0.8, alpha = 0.3, lty = "dashed") +
  geom_rug(sides = "b", alpha = 0.5, length = unit(0.03, "npc")) +
  labs(title = "Robustness check III: CATE of median voter position on main party average position", 
       subtitle = "Conditional on median voter certainty, GRF out-of-sample estimate",
       x = "Lagged Median Voter Certainty",  y = "CME of Lagged Median Voter Position")

## Figure 10 ----
data.frame(
  var_name = colnames(train[,xx]),
  var_imp = variable_importance(r1)
) %>% 
  mutate(var_name = case_when(
    var_name=="leftright_den_rlog_lag" ~ "Median Certainty",
    var_name=="compl_leftright" ~ "Left Competitor",
    var_name=="compr_leftright" ~ "Right Competitor",
    var_name=="leftright_pl_lag" ~ "Party Polarization",
    var_name=="vturn_lag" ~ "Voter Turnout",
    var_name=="effpar_ele_lag" ~ "Effective N. Parties",
    var_name=="gov_party_y3" ~ "Govt. Partisanship",
    var_name=="ud_ipol_y3" ~ "Union Density",
    var_name=="openc_y3" ~ "Trade Openness",
    var_name=="realgdpgr_y3" ~ "GDP Growth",
    var_name=="unemp_y3" ~ "Unemployment Rate",
  )) %>% 
  mutate(is_main = ifelse(var_name=="Median Certainty", 1, 0)) %>% 
  ggplot(aes(reorder(var_name, var_imp), var_imp)) +
  geom_col(aes(fill=factor(is_main)), show.legend = FALSE) +
  scale_fill_grey(start = 0.7, end = 0.2) +
  scale_x_discrete(labels = scales::wrap_format(width = 5)) +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Robustness check III: Variable importance of all covariates", x = NULL, y = "Var. Importance") 

# Figure 11 ---------------------------------------------------------------

## setup parallel runs ----
no_cores <- detectCores() - 1 
cl <- makeCluster(no_cores, type="FORK")  
registerDoParallel(cl)  # cluster setup

## fit models ----
### Panel A: DV = Average Position
foreach(cname = unique(data_core$countryname), .combine = 'rbind') %:%
  foreach(model = list(lm_mid_fe, lm_mid_ldv), .combine = 'rbind') %dopar% {
    model1 <- update(model, data = filter(data_core, countryname != cname)) # drop one country 
    se <- chcse(model = model1) # get hcse
    requireNamespace("broom")
    out <- 
      tidy(model1) %>% mutate(std.error = se) %>% # change se to hcse
      filter(term == "leftright_den_rlog_lag:leftright_med_lag") %>% 
      mutate(term = recode(term, "leftright_den_rlog_lag:leftright_med_lag" = "Position X Certainty")) %>% # rename interaction term
      add_column(excluded = cname, # add dropped country id
                 model_type = ifelse("countryname" %in% all.vars(formula(model1)), "FE", "LDV")) # add model type
  } -> jackout_mid
### Panel B: DV = Distance
foreach(cname = unique(data_core$countryname), .combine = 'rbind') %:%
  foreach(model = list(lm_dif_fe, lm_dif_ldv), .combine = 'rbind') %dopar% {
    model1 <- update(model, data = filter(data_core, countryname != cname)) # drop one country 
    se <- chcse(model = model1) # get hcse
    requireNamespace("broom")
    out <- 
      tidy(model1) %>% mutate(std.error = se) %>% # change se to hcse
      filter(term == "leftright_den_rlog_lag") %>% 
      mutate(term = recode(term, "leftright_den_rlog_lag" = "Certainty")) %>% # rename key term
      add_column(excluded = cname, # add dropped country id
                 model_type = ifelse("countryname" %in% all.vars(formula(model1)), "FE", "LDV")) # add model type
  } -> jackout_dif

## stop parallels ----
parallel::stopCluster(cl) # stop cluster

## Panel A: DV = Average Position ----
ggplot(jackout_mid, aes(excluded, estimate, color = model_type)) + 
  geom_point(position = position_dodge(width = 0.7)) +
  geom_errorbar(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error), 
                position = position_dodge(width = 0.7)) + 
  geom_hline(yintercept = 0, lty = "dashed") +
  scale_color_grey(start = 0.2, end = 0.6) +
  coord_flip() +
  theme(axis.ticks.y = element_blank()) +
  labs(title = "DV = Average Position, X = Median Position x Certainty", 
       x = "", y = expression(hat(beta)), color = "Model Type")

## Panel B: DV = Distance ----
ggplot(jackout_dif, aes(excluded, estimate, color = model_type)) + 
  geom_point(position = position_dodge(width = 0.7)) +
  geom_errorbar(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error), position = position_dodge(width = 0.7)) + 
  geom_hline(yintercept = 0, lty = "dashed") +
  scale_color_grey(start = 0.2, end = 0.6) +
  coord_flip() +
  theme(axis.ticks.y = element_blank()) +
  labs(title = "DV = Distance, X = Median Certainty", 
       x = "", y = expression(hat(beta)), color = "Model Type")

# Figure 12 ---------------------------------------------------------------

## split data by period ----
data_core_pre90 <- # 1965-1990
  data_core %>% 
  mutate(eyear1 = as.numeric(as.character(eyear))) %>% 
  filter(eyear1 <= 1990)
data_core_post90 <- # 1991-2018
  data_core %>% 
  mutate(eyear1 = as.numeric(as.character(eyear))) %>% 
  filter(eyear1 >= 1991)

## fit models by period ----

### 1965-1990
#### DV = average distance
lm_mid_fe_str_pre <- update(lm_mid_fe_str, data = data_core_pre90)
lm_mid_fe_str_pre_se <- chcse(lm_mid_fe_str_pre)
lm_mid_fe_pre <- update(lm_mid_fe, data = data_core_pre90)
lm_mid_fe_pre_se <- chcse(lm_mid_fe_pre)
lm_mid_ldv_str_pre <- update(lm_mid_ldv_str, data = data_core_pre90)
lm_mid_ldv_str_pre_se <- chcse(lm_mid_ldv_str_pre)
lm_mid_ldv_pre <- update(lm_mid_ldv, data = data_core_pre90)
lm_mid_ldv_pre_se <- chcse(lm_mid_ldv_pre)
#### DV = distance
lm_dif_fe_str_pre <- update(lm_dif_fe_str, data = data_core_pre90)
lm_dif_fe_str_pre_se <- chcse(lm_dif_fe_str_pre)
lm_dif_fe_pre <- update(lm_dif_fe, data = data_core_pre90)
lm_dif_fe_pre_se <- chcse(lm_dif_fe_pre)
lm_dif_ldv_str_pre <- update(lm_dif_ldv_str, data = data_core_pre90)
lm_dif_ldv_str_pre_se <- chcse(lm_dif_ldv_str_pre)
lm_dif_ldv_pre <- update(lm_dif_ldv, data = data_core_pre90)
lm_dif_ldv_pre_se <- chcse(lm_dif_ldv_pre)

### 1991-2018
#### DV = average position
lm_mid_fe_str_post <- update(lm_mid_fe_str, data = data_core_post90)
lm_mid_fe_str_post_se <- chcse(lm_mid_fe_str_post)
lm_mid_fe_post <- update(lm_mid_fe, data = data_core_post90)
lm_mid_fe_post_se <- chcse(lm_mid_fe_post)
lm_mid_ldv_str_post <- update(lm_mid_ldv_str, data = data_core_post90)
lm_mid_ldv_str_post_se <- chcse(lm_mid_ldv_str_post)
lm_mid_ldv_post <- update(lm_mid_ldv, data = data_core_post90)
lm_mid_ldv_post_se <- chcse(lm_mid_ldv_post)
#### DV = distance
lm_dif_fe_str_post <- update(lm_dif_fe_str, data = data_core_post90)
lm_dif_fe_str_post_se <- chcse(lm_dif_fe_str_post)
lm_dif_fe_post <- update(lm_dif_fe, data = data_core_post90)
lm_dif_fe_post_se <- chcse(lm_dif_fe_post)
lm_dif_ldv_str_post <- update(lm_dif_ldv_str, data = data_core_post90)
lm_dif_ldv_str_post_se <- chcse(lm_dif_ldv_str_post)
lm_dif_ldv_post <- update(lm_dif_ldv, data = data_core_post90)
lm_dif_ldv_post_se <- chcse(lm_dif_ldv_post)

## collect results ----

### parameter labels 
ID_models <- c("FE", "LDV")
ID_time <- c("1965-1990", "1991-2018")
ID_out <- 
  expand.grid(ID_models, ID_time) %$% 
  paste(.[[1]], .[[2]], sep = ",")

### DV = average position
bhat_lm_mid <- 
  map(list(lm_mid_fe_pre, lm_mid_ldv_pre, lm_mid_fe_post, lm_mid_ldv_post),
      ~{coef(.x)[["leftright_den_rlog_lag:leftright_med_lag"]]}) %>% 
  `names<-`(ID_out)
se_lm_mid <- 
  map(list(lm_mid_fe_pre_se, lm_mid_ldv_pre_se, lm_mid_fe_post_se, lm_mid_ldv_post_se),
      ~{.x[["leftright_den_rlog_lag:leftright_med_lag"]]}) %>% 
  `names<-`(ID_out)
df_lm_mid <- 
  map2(.x = list(bhat_lm_mid,se_lm_mid), 
       .y = c("bhat", "se"),
       .f = ~{
         bind_rows(.x, .id = "model") %>%
           pivot_longer(cols = everything(), names_to = "model0", values_to = .y)}) %$% 
  full_join(.[[1]], .[[2]], by = "model0") %>% 
  mutate(lwr = bhat - 1.96*se, upr = bhat + 1.96*se) %>% 
  separate(col = "model0", into = c("model", "time"), sep = ",")

### DV = distance
bhat_lm_dif <- 
  map(list(lm_dif_fe_pre, lm_dif_ldv_pre, lm_dif_fe_post, lm_dif_ldv_post),
      ~{coef(.x)[["leftright_den_rlog_lag"]]}) %>% 
  `names<-`(ID_out)
se_lm_dif <- 
  map(list(lm_dif_fe_pre_se, lm_dif_ldv_pre_se, lm_dif_fe_post_se, lm_dif_ldv_post_se),
      ~{.x[["leftright_den_rlog_lag"]]}) %>% 
  `names<-`(ID_out)
df_lm_dif <- 
  map2(.x = list(bhat_lm_dif,se_lm_dif), 
       .y = c("bhat", "se"),
       .f = ~{
         bind_rows(.x, .id = "model") %>%
           pivot_longer(cols = everything(), names_to = "model0", values_to = .y)}) %$% 
  full_join(.[[1]], .[[2]], by = "model0") %>% 
  mutate(lwr = bhat - 1.96*se, upr = bhat + 1.96*se) %>% 
  separate(col = "model0", into = c("model", "time"), sep = ",")

## Panel A: DV = Average Position ----
ggplot(df_lm_mid, aes(x = time, y = bhat, ymin = lwr, ymax = upr, color = model, shape = model)) +
  geom_pointrange(position = position_dodge(width = 0.7), fatten = 5) +
  geom_hline(yintercept = 0, lty = "dashed", size = 0.7) +
  scale_color_grey(start = 0.2, end = 0.6) +
  labs(title = "DV = Average Position, X = Median Position x Certainty",
       x = NULL, y = expression(hat(beta)), color = "Model Type", shape = "Model Type")

## Panel B: DV = Distance ----
ggplot(df_lm_dif, aes(x = time, y = bhat, ymin = lwr, ymax = upr, color = model, shape = model)) +
  geom_pointrange(position = position_dodge(width = 0.7), fatten = 5) +
  geom_hline(yintercept = 0, lty = "dashed", size = 0.7) +
  scale_color_grey(start = 0.2, end = 0.6) +
  labs(title = "DV = Distance, X = Median Certainty",
       x = NULL, y = expression(hat(beta)), color = "Model Type", shape = "Model Type")

# Figure 13 ---------------------------------------------------------------

## preprocess data ----
data_rollwind <- # set aside new dataset 
  data_clean %>% mutate(eyear_num = as.numeric(substring(edate,1,4))) %>% # add numeric year
  group_by(country) %>% 
  filter(min(eyear_num) %in% 1945:1950 & max(eyear_num) %in% 2015:2018) %>% # select 1945-2015 range 
  ungroup()
year_range <- seq(min(data_rollwind$eyear_num), max(data_rollwind$eyear_num)-50, 1) # 50-year span

## setup parallel runs ----
no_cores <- detectCores() - 1 
cl <- makeCluster(no_cores, type="FORK")  
registerDoParallel(cl)  # cluster setup

## fit models ----
### Panel A: DV = Average Position
foreach(yearmin = year_range, .combine = 'rbind') %:%
  foreach(model = list(lm_mid_fe, lm_mid_ldv), .combine = 'rbind') %dopar% {
    model1 <- update(model, data = filter(data_rollwind, eyear_num >= yearmin & eyear_num <= yearmin+50)) # apply rolling window
    se <- chcse(model = model1) # get hcse
    requireNamespace("broom")
    out <- 
      tidy(model1) %>% mutate(std.error = se) %>% # change se to hcse
      filter(term == "leftright_den_rlog_lag:leftright_med_lag") %>% 
      mutate(term = recode(term, "leftright_den_rlog_lag:leftright_med_lag" = "Position X Certainty")) %>% # rename interaction term
      add_column(period = paste0(as.character(yearmin), "-", as.character(yearmin+50)), # add period 
                 model_type = ifelse("countryname" %in% all.vars(formula(model1)), "FE", "LDV")) # add model type
  } -> rollwind_mid
### Panel B: DV = Distance
foreach(yearmin = year_range, .combine = 'rbind') %:%
  foreach(model = list(lm_dif_fe, lm_dif_ldv), .combine = 'rbind') %dopar% {
    model1 <- update(model, data = filter(data_rollwind, eyear_num >= yearmin & eyear_num <= yearmin+50)) # apply rolling window
    se <- chcse(model = model1) # get hcse
    requireNamespace("broom")
    out <- 
      tidy(model1) %>% mutate(std.error = se) %>% # change se to hcse
      filter(term == "leftright_den_rlog_lag") %>% 
      mutate(term = recode(term, "leftright_den_rlog_lag" = "Certainty")) %>% # rename key term
      add_column(period = paste0(as.character(yearmin), "-", as.character(yearmin+50)), # add period 
                 model_type = ifelse("countryname" %in% all.vars(formula(model1)), "FE", "LDV")) # add model type
  } -> rollwind_dif

## stop parallels ----
parallel::stopCluster(cl) # stop cluster

## Panel A: DV = Average Position ----
ggplot(rollwind_mid, aes(period, estimate, color = model_type)) + 
  geom_point(position = position_dodge(width = 0.7), size = 3.5) +
  geom_errorbar(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error), 
                position = position_dodge(width = 0.7)) + 
  geom_hline(yintercept = 0, lty = "dashed") +
  scale_color_grey(start = 0.2, end = 0.6) +
  theme(axis.ticks.y = element_blank(), 
        axis.text.x = element_text(angle = 30, hjust = 1), 
        axis.title.y = element_text(angle = 0, vjust = 0.5)) +
  labs(title = "DV = Average Position, X = Median Position x Certainty", 
       x = "", y = expression(hat(beta)), color = "Model Type")

## Panel B: DV = Distance ----
ggplot(rollwind_dif, aes(period, estimate, color = model_type)) + 
  geom_point(position = position_dodge(width = 0.7), size = 3.5) +
  geom_errorbar(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error), 
                position = position_dodge(width = 0.7)) + 
  geom_hline(yintercept = 0, lty = "dashed") +
  scale_color_grey(start = 0.2, end = 0.6) +
  theme(axis.ticks.y = element_blank(), 
        axis.text.x = element_text(angle = 30, hjust = 1), 
        axis.title.y = element_text(angle = 0, vjust = 0.5)) +
  labs(title = "DV = Distance, X = Median Certainty", 
       x = "", y = expression(hat(beta)), color = "Model Type")

# Tables 8-9 --------------------------------------------------------------

## add dispersion measure ----
load("RData_measure_voter.RData")
load("RData_manifesto_oecd.RData")
load("RData_measure_scale.RData")
measure_data <- select(manifesto_oecd, country, countryname, edate, eyear, pervote) %>% # select election vars
  cbind.data.frame(leftright, leftright_log) %>% # add policy scales 
  drop_na() 
measure_voter_disp <- 
  measure_data %>%
  group_by(country, countryname, edate, eyear) %>% 
  summarise(leftright_disp = sqrt(wtd.var(leftright, weights = pervote)),
            leftright_log_disp = sqrt(wtd.var(leftright_log, weights = pervote))) %>% # sd(voter) at t
  ungroup(edate, eyear) %>% # drop time id
  mutate(leftright_disp_delta = leftright_disp - dplyr::lag(leftright_disp),
         leftright_log_disp_delta = leftright_log_disp - dplyr::lag(leftright_log_disp)) %>% # first diff
  transform(leftright_disp_delta = scales::rescale(leftright_disp_delta, to = c(0,100)),
            leftright_log_disp_delta = scales::rescale(leftright_log_disp_delta, to = c(0,100))) %>% # rescale to 0-100
  filter(eyear>=1945)
data_uncertainty_disp <- measure_voter_disp %>% 
  select(country, countryname, edate, eyear, leftright_disp_delta)
data_add_uncertainty <- left_join(
  data_clean, data_uncertainty_disp %>% 
    transform(country = as.factor(country), eyear = as.factor(eyear)) # make id types compatible
)

## define helper (add new ctrl) ----
update_add_uncertainty <- function(x) {
  update(x, formula. = .~.+leftright_disp_delta, data = data_add_uncertainty)
}

## fit models ----

### DV = average position
rob_lm_mid_fe_str_disp <- update_add_uncertainty(lm_mid_fe_str)
rob_lm_mid_fe_str_disp_se <- chcse(rob_lm_mid_fe_str_disp)
rob_lm_mid_fe_disp <- update_add_uncertainty(lm_mid_fe)
rob_lm_mid_fe_disp_se <- chcse(rob_lm_mid_fe_disp)
rob_lm_mid_ldv_str_disp <- update_add_uncertainty(lm_mid_ldv_str)
rob_lm_mid_ldv_str_disp_se <- chcse(rob_lm_mid_ldv_str_disp)
rob_lm_mid_ldv_disp <- update_add_uncertainty(lm_mid_ldv)
rob_lm_mid_ldv_disp_se <- chcse(rob_lm_mid_ldv_disp)
### DV = distance
rob_lm_dif_fe_str_disp <- update_add_uncertainty(lm_dif_fe_str)
rob_lm_dif_fe_str_disp_se <- chcse(rob_lm_dif_fe_str_disp)
rob_lm_dif_fe_disp <- update_add_uncertainty(lm_dif_fe)
rob_lm_dif_fe_disp_se <- chcse(rob_lm_dif_fe_disp)
rob_lm_dif_ldv_str_disp <- update_add_uncertainty(lm_dif_ldv_str)
rob_lm_dif_ldv_str_disp_se <- chcse(rob_lm_dif_ldv_str_disp)
rob_lm_dif_ldv_disp <- update_add_uncertainty(lm_dif_ldv)
rob_lm_dif_ldv_disp_se <- chcse(rob_lm_dif_ldv_disp)

## Table 8: DV = Average Position ----
stargazer(rob_lm_mid_fe_str_disp, rob_lm_mid_fe_disp, rob_lm_mid_ldv_str_disp, rob_lm_mid_ldv_disp,
          se = list(rob_lm_mid_fe_str_disp_se, rob_lm_mid_fe_disp_se, rob_lm_mid_ldv_str_disp_se, rob_lm_mid_ldv_disp_se),
          column.labels = rep(c("FE", "LDV"), 4), column.sep.width = "0pt", 
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"), 
          df = F, digits = 2, align = T, style = "ajps", notes.label = "Notes: ", notes.align ="l", 
          star.cutoffs = c(0.05, 0.01, 0.001), notes = "Heteroscedasticity-consistent standard errors clustered by country in brackets.") 

## Table 9: DV = Distance ----
stargazer(rob_lm_dif_fe_str_disp, rob_lm_dif_fe_disp, rob_lm_dif_ldv_str_disp, rob_lm_dif_ldv_disp,
          se = list(rob_lm_dif_fe_str_disp_se, rob_lm_dif_fe_disp_se, rob_lm_dif_ldv_str_disp_se, rob_lm_dif_ldv_disp_se),
          column.labels = rep(c("FE", "LDV"), 4), column.sep.width = "0pt", 
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"), 
          df = F, digits = 2, align = T, style = "ajps", notes.label = "Notes: ", notes.align ="l", 
          star.cutoffs = c(0.05, 0.01, 0.001), notes = "Heteroscedasticity-consistent standard errors clustered by country in brackets.") 

# Tables 10-13 ------------------------------------------------------------

## add electoral system measures ----
cpds_elec <- haven::read_dta("CPDS-1960-2017-Update-2019.dta") %>% # read cpds data
  mutate(country = ifelse(country=="USA", "United States", country)) %>% # correct usa countryname
  select(countryname = country, eyear = year, prop) # select variables & rename for merging
data_clean <- data_clean %>% 
  mutate(eyear = as.numeric(levels(eyear))[eyear]) %>% # convert eyear back to numeric (for merging)
  left_join(., cpds_elec) %>% # merge
  mutate(eyear = factor(eyear), prop = factor(prop)) # make eyear, prop factor

## fit models ----

### X(PR) = categorical var (dummies)
#### DV = average position
lm_mid_fe_str_ele_pr <- update(lm_mid_fe_str, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*prop, data = data_clean)
lm_mid_fe_str_ele_pr_cse <- chcse(lm_mid_fe_str_ele_pr)
lm_mid_fe_ele_pr <- update(lm_mid_fe, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*prop, data = data_clean)
lm_mid_fe_ele_pr_cse <- chcse(lm_mid_fe_ele_pr)
lm_mid_ldv_str_ele_pr <- update(lm_mid_ldv_str, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*prop, data = data_clean)
lm_mid_ldv_str_ele_pr_cse <- chcse(lm_mid_ldv_str_ele_pr)
lm_mid_ldv_ele_pr <- update(lm_mid_ldv, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*prop, data = data_clean)
lm_mid_ldv_ele_pr_cse <- chcse(lm_mid_ldv_ele_pr)
#### DV = distance
lm_dif_fe_str_ele_pr <- update(lm_dif_fe_str, formula. = .~.+leftright_den_rlog_lag*prop, data = data_clean)
lm_dif_fe_str_ele_pr_cse <- chcse(lm_dif_fe_str_ele_pr)
lm_dif_fe_ele_pr <- update(lm_dif_fe, formula. = .~.+leftright_den_rlog_lag*prop, data = data_clean)
lm_dif_fe_ele_pr_cse <- chcse(lm_dif_fe_ele_pr)
lm_dif_ldv_str_ele_pr <- update(lm_dif_ldv_str, formula. = .~.+leftright_den_rlog_lag*prop, data = data_clean)
lm_dif_ldv_str_ele_pr_cse <- chcse(lm_dif_ldv_str_ele_pr)
lm_dif_ldv_ele_pr <- update(lm_dif_ldv, formula. = .~.+leftright_den_rlog_lag*prop, data = data_clean)
lm_dif_ldv_ele_pr_cse <- chcse(lm_dif_ldv_ele_pr)

### X(PR) = continuous scale
#### DV = average position
lm_mid_fe_str_ele_effpar <- update(lm_mid_fe_str, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_mid_fe_str_ele_effpar_cse <- chcse(lm_mid_fe_str_ele_effpar)
lm_mid_fe_ele_effpar <- update(lm_mid_fe, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_mid_fe_ele_effpar_cse <- chcse(lm_mid_fe_ele_effpar)
lm_mid_ldv_str_ele_effpar <- update(lm_mid_ldv_str, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_mid_ldv_str_ele_effpar_cse <- chcse(lm_mid_ldv_str_ele_effpar)
lm_mid_ldv_ele_effpar <- update(lm_mid_ldv, formula. = .~.+leftright_med_lag*leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_mid_ldv_ele_effpar_cse <- chcse(lm_mid_ldv_ele_effpar)
#### DV = distance
lm_dif_fe_str_ele_effpar <- update(lm_dif_fe_str, formula. = .~.+leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_dif_fe_str_ele_effpar_cse <- chcse(lm_dif_fe_str_ele_effpar)
lm_dif_fe_ele_effpar <- update(lm_dif_fe, formula. = .~.+leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_dif_fe_ele_effpar_cse <- chcse(lm_dif_fe_ele_effpar)
lm_dif_ldv_str_ele_effpar <- update(lm_dif_ldv_str, formula. = .~.+leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_dif_ldv_str_ele_effpar_cse <- chcse(lm_dif_ldv_str_ele_effpar)
lm_dif_ldv_ele_effpar <- update(lm_dif_ldv, formula. = .~.+leftright_den_rlog_lag*effpar_ele_lag, data = data_clean)
lm_dif_ldv_ele_effpar_cse <- chcse(lm_dif_ldv_ele_effpar)

## Table 10: DV = average position, X = categorical measure ----
stargazer(list(lm_mid_fe_str_ele_pr, lm_mid_fe_ele_pr, lm_mid_ldv_str_ele_pr, lm_mid_ldv_ele_pr), 
          se = list(lm_mid_fe_str_ele_pr_cse, lm_mid_fe_ele_pr_cse, lm_mid_ldv_str_ele_pr_cse, lm_mid_ldv_ele_pr_cse), 
          digits = 2, align = T, star.cutoffs = c(0.05, 0.01, 0.001), df = F,
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"))

## Table 11: DV = distance, X = categorical measure ----
stargazer(list(lm_dif_fe_str_ele_pr, lm_dif_fe_ele_pr, lm_dif_ldv_str_ele_pr, lm_dif_ldv_ele_pr), 
          se = list(lm_dif_fe_str_ele_pr_cse, lm_dif_fe_ele_pr_cse, lm_dif_ldv_str_ele_pr_cse, lm_dif_ldv_ele_pr_cse), 
          digits = 2, align = T, star.cutoffs = c(0.05, 0.01, 0.001), df = F,
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"))

## Table 12: DV = average position, X = continuous measure ----
stargazer(list(lm_mid_fe_str_ele_effpar, lm_mid_fe_ele_effpar, lm_mid_ldv_str_ele_effpar, lm_mid_ldv_ele_effpar), 
          se = list(lm_mid_fe_str_ele_effpar_cse, lm_mid_fe_ele_effpar_cse, lm_mid_ldv_str_ele_effpar_cse, lm_mid_ldv_ele_effpar_cse), 
          digits = 2, align = T, star.cutoffs = c(0.05, 0.01, 0.001), df = F,
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"))

## Table 13: DV = distance, X = continuous measure ----
stargazer(list(lm_dif_fe_str_ele_effpar, lm_dif_fe_ele_effpar, lm_dif_ldv_str_ele_effpar, lm_dif_ldv_ele_effpar), 
          se = list(lm_dif_fe_str_ele_effpar_cse, lm_dif_fe_ele_effpar_cse, lm_dif_ldv_str_ele_effpar_cse, lm_dif_ldv_ele_effpar_cse), 
          digits = 2, align = T, star.cutoffs = c(0.05, 0.01, 0.001), df = F,
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"))

# Tables 14-15 ------------------------------------------------------------

## add time lapse measure ----
data_interelec <- data_clean %>% 
  group_by(country, countryname) %>% 
  arrange(edate) %>% 
  mutate(interelect = as.numeric((edate - lag(edate))/30)) %>% # time diff in months
  arrange(country) 

## fit models ----

### DV = average position 
interelect_lm_mid_fe_str <- update(lm_mid_fe_str, .~.+leftright_med_lag*leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_mid_fe_str_cse <- chcse(interelect_lm_mid_fe_str)
interelect_lm_mid_fe <- update(lm_mid_fe, .~.+leftright_med_lag*leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_mid_fe_cse <- chcse(interelect_lm_mid_fe)
interelect_lm_mid_ldv_str <- update(lm_mid_ldv_str, .~.+leftright_med_lag*leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_mid_ldv_str_cse <- chcse(interelect_lm_mid_ldv_str)
interelect_lm_mid_ldv <- update(lm_mid_ldv, .~.+leftright_med_lag*leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_mid_ldv_cse <- chcse(interelect_lm_mid_ldv)
### DV = distance
interelect_lm_dif_fe_str <- update(lm_dif_fe_str, .~.+leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_dif_fe_str_cse <- chcse(interelect_lm_dif_fe_str)
interelect_lm_dif_fe <- update(lm_dif_fe, .~.+leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_dif_fe_cse <- chcse(interelect_lm_dif_fe)
interelect_lm_dif_ldv_str <- update(lm_dif_ldv_str, .~.+leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_dif_ldv_str_cse <- chcse(interelect_lm_dif_ldv_str)
interelect_lm_dif_ldv <- update(lm_dif_ldv, .~.+leftright_den_rlog_lag*interelect, data = data_interelec)
interelect_lm_dif_ldv_cse <- chcse(interelect_lm_dif_ldv)

## Table 14: DV = Average Position ----
stargazer(interelect_lm_mid_fe_str, interelect_lm_mid_fe, interelect_lm_mid_ldv_str, interelect_lm_mid_ldv, 
          se = list(interelect_lm_mid_fe_str_cse, interelect_lm_mid_fe_cse, interelect_lm_mid_ldv_str_cse, interelect_lm_mid_ldv_cse),
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"), 
          column.labels = rep(c("FE", "ldv"), 4), column.sep.width = "0pt", 
          df = F, digits = 2, align = T, style = "ajps", notes.label = "Notes: ", notes.align ="l", 
          star.cutoffs = c(0.05, 0.01, 0.001), notes = "Heteroscedasticity-consistent standard errors clustered by country in brackets.") 

## Table 15: DV = Distance ----
stargazer(interelect_lm_dif_fe_str, interelect_lm_dif_fe, interelect_lm_dif_ldv_str, interelect_lm_dif_ldv, 
          se = list(interelect_lm_dif_fe_str_cse, interelect_lm_dif_fe_cse, interelect_lm_dif_ldv_str_cse, interelect_lm_dif_ldv_cse),
          omit = c("countryname", "eyear"), omit.labels = c("Country FE", "Year FE"), 
          column.labels = rep(c("FE", "ldv"), 4), column.sep.width = "0pt", 
          df = F, digits = 2, align = T, style = "ajps", notes.label = "Notes: ", notes.align ="l", 
          star.cutoffs = c(0.05, 0.01, 0.001), notes = "Heteroscedasticity-consistent standard errors clustered by country in brackets.") 

# Figure 14 ---------------------------------------------------------------

## estimate CMEs ----
me_dif_fe_intelec <- margins(
  interelect_lm_dif_fe, vcov=vcovHC(interelect_lm_dif_fe, type="HC0", cluster = "countryname"),
  data = droplevels(interelect_lm_dif_fe$model), variables = "leftright_den_rlog_lag",
  at = list(interelect = prediction::seq_range(interelect_lm_dif_fe$model[["interelect"]], 10)))

## plot CMEs ----
ggplot(me_dif_fe_intelec) +
  geom_histogram(data = interelect_lm_dif_fe$model, 
                 aes(x = interelect, y = ..count../sum(..count..)*1.8), 
                 bins = 50, color = "gray", fill = "white", alpha = 0.7, inherit.aes = F) + 
  geom_line(aes(x = interelect, y = dydx_leftright_den_rlog_lag), size = 0.5) +
  geom_line(aes(x = interelect, y = dydx_leftright_den_rlog_lag + 1.96*sqrt(Var_dydx_leftright_den_rlog_lag)), lty = "dashed", size = 0.3, alpha = 0.9) +
  geom_line(aes(x = interelect, y = dydx_leftright_den_rlog_lag - 1.96*sqrt(Var_dydx_leftright_den_rlog_lag)), lty = "dashed", size = 0.3, alpha = 0.9) +  
  geom_point(aes(x = interelect, y = dydx_leftright_den_rlog_lag), size = 2, shape = 18) +
  scale_linetype_manual(labels = c("Short (Q1)", "Long (Q3)"), values = c("solid", "dashed")) +
  labs(title = "CME of median voter certainty on main party distance", 
       subtitle = "By length of inter-election time (months), FE estimates",
       x = "Inter-Election Time (Months) ", y = "CME of Lagged Median Voter Position") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) 

# reset graphx ------------------------------------------------------------

theme_set(theme_base)

# clear env. --------------------------------------------------------------

rm(list = ls())

# END SCRIPT